import polars as pl
import numpy as np
from lets_plot import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
)
# add the additional libraries you need to import for ML here

# Set up lets-plot HTML output before any chart is rendered.
LetsPlot.setup_html(isolated_frame=True)
Show the code
# Load the Star Wars survey with polars and give every column a short
# snake_case name.
# `url` records where the data is published; the read below uses the local
# file "StarWars.csv" (presumably a downloaded copy -- TODO confirm).
url = "https://github.com/fivethirtyeight/data/raw/master/star-wars-survey/StarWars.csv"
df = pl.read_csv("StarWars.csv")

_numerals = ("i", "ii", "iii", "iv", "v", "vi")
_raw_cols = df.columns

_new_names = {_raw_cols[1]: "seen", _raw_cols[2]: "fan"}

# Columns 3-8 become seen_epi_<numeral>, columns 9-14 become rank_epi_<numeral>.
for _offset, _numeral in enumerate(_numerals):
    _new_names[_raw_cols[3 + _offset]] = f"seen_epi_{_numeral}"
    _new_names[_raw_cols[9 + _offset]] = f"rank_epi_{_numeral}"

# Columns 15-28 are named after the value in their first data row.
for _idx in range(15, 29):
    _new_names[_raw_cols[_idx]] = df[_raw_cols[_idx]][0].lower().replace(" ", "_")

_new_names[_raw_cols[29]] = "shot_first"
_new_names[_raw_cols[30]] = "ex_uni"
_new_names[_raw_cols[31]] = "fan_ex_uni"
_new_names[_raw_cols[32]] = "fan_star_trek"

# Columns 33-36 keep their header text, lower-cased with spaces as underscores.
for _idx in range(33, 37):
    _new_names[_raw_cols[_idx]] = _raw_cols[_idx].lower().replace(" ", "_")

_new_names[_raw_cols[37]] = "location"

# Drop the first data row: it was only needed above as a source of column
# names (presumably a second header line in the CSV).
df_clean = df.rename(_new_names).slice(1)
# Helper functions for obtaining graph specific DataFrames
seen_cols = [
    "seen_epi_i_yes",
    "seen_epi_ii_yes",
    "seen_epi_iii_yes",
    "seen_epi_iv_yes",
    "seen_epi_v_yes",
    "seen_epi_vi_yes",
]
rank_cols = [
    "rank_epi_i",
    "rank_epi_ii",
    "rank_epi_iii",
    "rank_epi_iv",
    "rank_epi_v",
    "rank_epi_vi",
]
movies = [
    "The Phantom Menace",
    "Attack of the Clones",
    "Revenge of the Sith",
    "A New Hope",
    "The Empire Strikes Back",
    "Return of the Jedi",
]


def _with_perc_label(frame):
    """Add ``perc_label``: ``percentage`` * 100, rounded to a whole number, plus '%'."""
    label = (pl.col("percentage") * 100).round(0).cast(pl.Int64).cast(pl.String) + "%"
    return frame.with_columns(label.alias("perc_label"))


def _ranks_of_full_viewers(data):
    """Return the ``rank_cols`` of rows in which every ``seen_cols`` value is > 0."""
    saw_everything = pl.all_horizontal([pl.col(c) > 0 for c in seen_cols])
    ranks = data.filter(saw_everything).select(rank_cols)
    # Episode 3 has 1 missing rank (when compared to the other movies the rank
    # option left is 6)
    return ranks.with_columns(pl.col("rank_epi_iii").replace("0", "6"))


def GetSeen(data):
    """Return, per movie, the share of viewers who have seen it.

    Only rows where at least one ``seen_cols`` value equals 1 are counted.

    Args:
        data: polars DataFrame containing the ``seen_cols`` columns.

    Returns:
        DataFrame with ``movie`` (in reverse ``movies`` order), ``percentage``
        (fraction of the counted rows) and ``perc_label``.
    """
    seen_any = pl.any_horizontal([pl.col(c) == 1 for c in seen_cols])
    data_seen = data.filter(seen_any).select(seen_cols)
    total = data_seen.height
    # Count the 1s directly instead of indexing into a sorted value_counts():
    # that positional lookup breaks when a column holds a single distinct
    # value or contains nulls.
    seen_percs = [
        data_seen.filter(pl.col(c) == 1).height / total for c in seen_cols
    ]
    df_percs = pl.DataFrame(
        {"movie": movies[::-1], "percentage": seen_percs[::-1]}
    )
    return _with_perc_label(df_percs)


def GetRanks(data):
    """Return, per movie, the share of full viewers who ranked it "1".

    Only rows where every ``seen_cols`` value is > 0 are counted.

    Args:
        data: polars DataFrame containing ``seen_cols`` and ``rank_cols``
            (ranks are compared as strings).

    Returns:
        DataFrame with ``movie`` (in reverse ``movies`` order), ``percentage``
        and ``perc_label``.
    """
    data_rank = _ranks_of_full_viewers(data)
    total = data_rank.height
    rank_percs = [
        data_rank.filter(pl.col(c) == "1").height / total for c in rank_cols
    ]
    data_rank_percs = pl.DataFrame(
        {"movie": movies[::-1], "percentage": rank_percs[::-1]}
    )
    return _with_perc_label(data_rank_percs)


def GetRatings(data):
    """Return, per movie, how often full viewers ranked it in each third.

    Ranks "1"/"2" count as the top third, "3"/"4" as the middle third and
    "5"/"6" as the bottom third. Only rows where every ``seen_cols`` value is
    > 0 are counted.

    Args:
        data: polars DataFrame containing ``seen_cols`` and ``rank_cols``.

    Returns:
        Long-format DataFrame with ``movie``, ``rating`` ('Top third',
        'Middle third' or 'Bottom third'), ``percentage`` and ``perc_label``.
    """
    data_ratings = _ranks_of_full_viewers(data)
    third_of_rank = {"1": 1, "2": 1, "3": 2, "4": 2, "5": 3, "6": 3}
    thirds = data_ratings.with_columns(
        [
            pl.col(c).replace_strict(third_of_rank, default=0).alias(c)
            for c in data_ratings.columns
        ]
    )
    third_counts = (
        thirds.unpivot(variable_name="movie")
        .group_by(["movie", "value"])
        .len()  # replaces the deprecated GroupBy.count()
        # group_by does not guarantee an output order, so sort the pivoted
        # columns to keep the layout (and the long-format row order below)
        # the same on every run.
        .pivot("value", index="movie", sort_columns=True)
        # The rank_epi_* names sort in episode order, which lines the rows up
        # with `movies` for the positional relabel below.
        .sort("movie")
    )
    third_counts = third_counts.with_columns(movie=pl.Series(movies))
    # Turn every count column into a fraction of the counted rows; selecting
    # by exclusion avoids depending on the columns' positions.
    third_percs = third_counts.with_columns(pl.exclude("movie") / thirds.height)
    third_percs = third_percs.rename(
        {"1": "Top third", "2": "Middle third", "3": "Bottom third"}
    )
    third_long = third_percs.unpivot(
        index="movie", variable_name="rating", value_name="percentage"
    )
    return _with_perc_label(third_long)
QUESTION 1
Build a machine learning model that predicts whether a person makes at least $50k with an accuracy of at least 65%. Describe your model and report the accuracy.
Describe your model and report the accuracy.
Show the code
# Train a gradient-boosting classifier on an 80/20 train/test split and print
# its classification report (which includes the overall accuracy).
# NOTE(review): X and y are not defined in this section -- presumably the
# feature matrix and target are built in an earlier cell; confirm.
rnd = 343
X_trn, X_tst, y_trn, y_tst = train_test_split(
    X, y, test_size=0.2, random_state=rnd
)

# Seed the model as well as the split so repeated runs give the same report.
model = GradientBoostingClassifier(random_state=rnd)
model.fit(X_trn, y_trn)

pred = model.predict(X_tst)
print(classification_report(y_tst, pred))